#!/usr/bin/env python
# -*- coding:utf-8 -*-

import os
import re
import sys

import jieba.posseg as pseg

# Python 2 only: make UTF-8 the interpreter's default codec for implicit
# str <-> unicode conversions. reload(sys) has to come first because
# sys.setdefaultencoding is removed from the sys module during interpreter
# startup and only reappears after a reload.
# NOTE(review): this is a process-wide setting; it affects every implicit
# conversion in the program, not only the ones performed by this module.
reload(sys)
sys.setdefaultencoding('utf-8')

# Only name_extraction is exported by "from <module> import *".
__all__ = ['name_extraction']
# Absolute directory of this module, with a trailing "/" so file names can be
# appended directly. abspath() is required: when the module is run as a script
# from its own directory, dirname(__file__) is '' and the bare "/" suffix would
# otherwise make every derived path point at the filesystem root.
CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) + "/"
SUB_PATTERN = re.compile(ur'\(|\)|（|）')
city2provFile = CURRENT_PATH + 'listProvCity.py'
dictCity2Prov = {}
provList = set()
cityList = set()

stopList = [u'有限', u'责任', u'公司', u'股份']
endWords = [u'局', u'厂', u'行', u'店', u'办', u'部', u'会', u'队', u'集团', u'院', u'校', u'学', u'园', u'中心']
specialWords = [u'第一', u'第二', u'第三', u'第四', u'第五', u'第六', u'第七', u'第八', u'第九', u'第十']

# Fill the province/city lookup tables from the data file. Every usable line
# has at least two comma-separated fields: "<province>,<city>[,...]".
with open(city2provFile) as f:
    for line in f:
        fields = line.split(',')
        if len(fields) < 2:
            # Blank or malformed line: skip it instead of failing the whole
            # import with an IndexError.
            continue
        rawCity = fields[1].strip()
        # Decode explicitly so the result does not depend on the interpreter's
        # default encoding.
        prov = fields[0].decode('utf-8')
        # An empty entry must never reach the sets: u'' is a substring of
        # every name, so it would make every region check succeed.
        if prov:
            provList.add(prov)
        if rawCity:
            # NOTE(review): keys stay the raw (undecoded) byte strings, as
            # before, while cityList holds unicode; confirm what callers of
            # dictCity2Prov expect before unifying the two.
            dictCity2Prov[rawCity] = prov
            cityList.add(rawCity.decode('utf-8'))


def provCityName(name):
    """Return the known region names that occur inside ``name``.

    City names take precedence: the province names are only searched when no
    city name is contained in ``name``. The result is an empty list when
    neither kind matches.
    """
    cityHits = [city for city in cityList if city in name]
    if cityHits:
        return cityHits
    return [prov for prov in provList if prov in name]


def _mergeShortWords(wordList, flagList):
    """Glue one-character fragments onto a neighbouring word, in place.

    ``wordList`` and ``flagList`` are parallel lists (word, POS flag). Both
    are shortened together so that a flag always describes the word at the
    same index. The last word is never inspected, only absorbed.
    """
    adminSuffixes = (u'省', u'市', u'区', u'县')
    merged = True
    while merged:
        merged = False
        for i in range(len(wordList) - 1):
            word = wordList[i]
            if i > 0 and word in adminSuffixes:
                # An administrative suffix belongs to the word before it.
                target = i - 1
            elif len(word) == 1:
                # A single character leads the next word when it opens the
                # name, is tagged 'a', or follows a region name; otherwise it
                # trails the previous word. A suffix at index 0 lands here
                # too, because it has no previous word to attach to.
                if flagList[i] == 'a' or i == 0 or provCityName(wordList[i - 1]):
                    target = i
                else:
                    target = i - 1
            else:
                continue
            wordList[target] += wordList[target + 1]
            del wordList[target + 1]
            del flagList[target + 1]
            merged = True
            # Indices shifted; rescan from the start.
            break


def name_extraction(companyName):
    """Return candidate short names for a company name.

    The full (stripped) name is always a candidate. Further candidates are
    built from the part of the name before the first u'公司': brackets are
    removed, the text is segmented with jieba, stop words and a trailing
    organisation-type word are dropped, short fragments are merged, and the
    leading words are combined.

    :param companyName: unicode string, or a UTF-8 encoded byte string.
    :return: list of unique unicode candidates in arbitrary order, each at
        least two characters long and not itself a known province or city
        name. ``[]`` for empty input or input of any other type. If
        segmentation fails unexpectedly, only the stripped full name is
        returned.
    :raises UnicodeDecodeError: if a byte string is not valid UTF-8.
    """
    if not companyName:
        return []
    if isinstance(companyName, str):
        companyName = unicode(companyName, 'utf-8')
    elif not isinstance(companyName, unicode):
        return []

    fullName = companyName.strip()
    nameList = [fullName]
    # Names with an ordinal ("第一", ...) are kept whole.
    if any(word in companyName for word in specialWords):
        return nameList

    try:
        coreName = removeBracket(companyName).split(u'公司')[0]
        segments = list(pseg.cut(coreName))
        # Drop a trailing organisation-type word unless it is the only word.
        if len(segments) > 1 and any(x in segments[-1].word for x in endWords):
            segments.pop()

        wordList = []
        flagList = []
        for segment in segments:
            if not any(stop in segment.word for stop in stopList):
                wordList.append(segment.word)
                flagList.append(segment.flag)

        _mergeShortWords(wordList, flagList)

        count = len(wordList)
        if count == 1:
            nameList.append(wordList[0])
        elif count > 1:
            hasRegion = (
                any(city in coreName for city in cityList)
                or any(prov in coreName for prov in provList)
                or u'中国' in coreName
            )
            if (hasRegion and count > 3
                    and any(city in wordList[1] for city in cityList)):
                # The first two words are both regional; use what follows.
                nameList.append(wordList[2] + wordList[3])
            elif hasRegion and count > 2:
                nameList.append(wordList[0] + wordList[1])
                nameList.append(wordList[1] + wordList[2])
                nameList.append(wordList[0] + wordList[1] + wordList[2])
            else:
                nameList.append(wordList[0] + wordList[1])
    except Exception:
        # Best effort: segmentation problems must not break the caller, so
        # fall back to the untouched full name.
        return [fullName]

    candidates = [
        name for name in nameList
        if name not in provList and name not in cityList and len(name) >= 2
    ]
    return list(set(candidates))


def removeBracket(companyName):
    """Return ``companyName`` with every ASCII and full-width parenthesis removed."""
    withoutBrackets = re.sub(SUB_PATTERN, '', companyName)
    return withoutBrackets


if __name__ == '__main__':
    print '\n'.join(name_extraction(u'杭州誉存科技有限公司'))
